import math
import random
from pprint import pprint
import pandas as pd
import numpy as np
from subprocess import call
from IPython.display import Image
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from rgf import RGFRegressor
from rgf import RGFClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from treeinterpreter import treeinterpreter as ti
from sklearn.tree import export_graphviz
#!conda install -c districtdatalabs yellowbrick --yes
#!pip install rgf_python -v
#!pip install treeinterpreter
# Load the Boston housing dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions the data must be fetched another way.
boston = load_boston()
print(boston['DESCR'])
bostonDF = pd.DataFrame(boston['data'], columns=boston['feature_names'])
bostonDF['TARGET'] = boston['target']
bostonDF.head()
X = bostonDF.drop('TARGET', axis=1)
y = bostonDF['TARGET']
# Pin the split seed so every model below is trained/scored on the same,
# reproducible hold-out set (the rest of the file pins random_state; the
# split should too).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Column groups for preprocessing: every Boston feature is numeric.
featuresNumeric = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
# Numeric columns: fill any gaps with the column median.
transformerNumeric = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
# This dataset has no categorical columns, but keep the branch so the
# preprocessor generalizes to data that does.
featuresCategorical = []
transformerCategorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', transformerNumeric, featuresNumeric),
    ('categorical', transformerCategorical, featuresCategorical),
])
def metrics(truth, predictions):
    """Print and return regression error metrics.

    Parameters
    ----------
    truth : array-like of true target values.
    predictions : array-like of predicted values, same length as truth.

    Returns
    -------
    tuple (mae, mse, rmse, accuracy) where accuracy = 100 minus the mean
    absolute percentage error.
    """
    # Compute MAE/MSE directly with numpy (numerically identical to the
    # sklearn helpers) so this function is self-contained.
    truth = np.asarray(truth, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    errors = np.abs(truth - predictions)
    mae = np.mean(errors)
    mse = np.mean((truth - predictions) ** 2)
    rmse = math.sqrt(mse)
    # Print out error for predictions
    print('MAE : {:.3f}'.format(mae))
    print('MSE : {:.3f}'.format(mse))
    print('RMSE : {:.3f}'.format(rmse))
    # Compute Accuracy via MAPE. NOTE: undefined (inf) when truth contains
    # zeros; fine for Boston house prices, which are strictly positive.
    mape = 100 * (errors / truth)
    accuracy = 100 - np.mean(mape)
    print('Accuracy : {:.2f}%'.format(accuracy))
    return (mae, mse, rmse, accuracy)
# Results table: one row per model, filled in as each model is evaluated.
resultsDF = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE', 'Accuracy'])

# Baseline model: a single, fully-grown decision tree.
dt = DecisionTreeRegressor(random_state=301)
dtPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', dt),
])
dtPipe.fit(X_train, y_train)
preds = dtPipe.predict(X_test)
# R^2 on train vs test; a big gap indicates overfitting.
print('Score Train {:.3f}'.format(dtPipe.score(X_train, y_train)))
print('Score Test {:.3f}'.format(dtPipe.score(X_test, y_test)))
maeDT, mseDT, rmseDT, accuracyDT = metrics(y_test, preds)
resultsDF.loc['Decision Tree'] = [maeDT, mseDT, rmseDT, accuracyDT]
# Residuals plot for the decision tree model.
plt.figure(figsize=(15, 10))
visualizer = ResidualsPlot(dtPipe)
visualizer.fit(X_train, y_train)    # Fit the training data to the model
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.poof()

# Render the top of the fitted tree (depth 2) to PNG via graphviz `dot`.
export_graphviz(dt, out_file='dtree.dot',
                feature_names=X_train.columns,
                rounded=True, proportion=False,
                precision=2, filled=True, max_depth=2)
call(['dot', '-Tpng', 'dtree.dot', '-o', 'dtree.png', '-Gdpi=600'])
Image(filename='./dtree.png')
# Inspect the decision path for one test sample: take the first hold-out row
# as a single-row DataFrame and predict it through the fitted pipeline.
X_sample = pd.DataFrame(X_test.copy().reset_index(drop=True).loc[0]).T
predicted = dtPipe.predict(X_sample)
sample_id = 0
# Low-level access to the fitted tree's internal arrays.
n_nodes = dt.tree_.node_count
children_left = dt.tree_.children_left
children_right = dt.tree_.children_right
feature = dt.tree_.feature
threshold = dt.tree_.threshold
# NOTE(review): decision_path/apply are fed the raw (un-preprocessed) sample;
# this matches what the pipeline-fitted tree saw only because preprocessing
# here is just median imputation on complete numeric columns -- confirm if
# the preprocessor ever gains scaling or encoding steps.
node_indicator = dt.decision_path([X_sample.iloc[0]])
leave_id = dt.apply([X_sample.iloc[0]])
# Node ids on the root-to-leaf path for this sample (CSR indptr slicing).
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]
print('Values for Sample\n', X_sample.T, '\n')
print('Rules used to predict')
for node_id in node_index:
    # Skip the terminal leaf; only internal nodes carry a split rule.
    if leave_id[sample_id] == node_id:
        continue
    # Which side of the node's threshold did this sample fall on?
    if X_sample.iloc[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
        threshold_sign = ">"
    print("Node[%3s] %10s = %10s %3s %10s"
          % (node_id,
             X_test.columns[feature[node_id]],
             X_test.iloc[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))
print('Predicted : %s' % predicted)
print('Number of leaves in decision tree %s' % dt.get_n_leaves())
print('Depth of decision tree %s' % dt.get_depth())
# Regularized tree: cap depth and leaf count to curb the baseline's overfit.
dtReg = DecisionTreeRegressor(max_depth=10, max_leaf_nodes=200, random_state=101)
dtRegPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', dtReg),
])
dtRegPipe.fit(X_train, y_train)
preds = dtRegPipe.predict(X_test)
print('Score Train {:.3f}'.format(dtRegPipe.score(X_train, y_train)))
print('Score Test {:.3f}'.format(dtRegPipe.score(X_test, y_test)))
maeDT, mseDT, rmseDT, accuracyDT = metrics(y_test, preds)
resultsDF.loc['Regularized Decision Tree'] = [maeDT, mseDT, rmseDT, accuracyDT]
# Hyper-parameter search space for the random forest.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Create the random grid for feature selection
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
pprint(random_grid)
rfSearch = RandomForestRegressor()
rfRandomSearch = RandomizedSearchCV(estimator=rfSearch,
                                    param_distributions=random_grid,
                                    n_iter=300,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
# Fit the random search model
#rfRandomSearch.fit(X_train, y_train)
rfSearchPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', rfRandomSearch),
])
rfSearchPipe.fit(X_train, y_train)
rfRandomSearch.best_params_
# Refit a forest with the best parameters found by the search; enable OOB
# scoring for a validation estimate that needs no extra hold-out data.
bestRF = rfRandomSearch.best_params_
rf = RandomForestRegressor(n_estimators=bestRF['n_estimators'],
                           min_samples_split=bestRF['min_samples_split'],
                           min_samples_leaf=bestRF['min_samples_leaf'],
                           max_features=bestRF['max_features'],
                           oob_score=True)
rfPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', rf),
])
rfPipe.fit(X_train, y_train)
print('OOB {:.3f}'.format(rf.oob_score_))
preds = rfPipe.predict(X_test)
print('Score Train {:.3f}'.format(rfPipe.score(X_train, y_train)))
print('Score Test {:.3f}'.format(rfPipe.score(X_test, y_test)))
# Residuals plot for the random forest.
plt.figure(figsize=(15, 10))
visualizer = ResidualsPlot(rfPipe)
visualizer.fit(X_train, y_train)    # Fit the training data to the model
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.poof()
maeRF, mseRF, rmseRF, accuracyRF = metrics(y_test, preds)
resultsDF.loc['Random Forest'] = [maeRF, mseRF, rmseRF, accuracyRF]
# Plot feature importances, ordered least to most important.
features = X_train.columns
importances = rf.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(15, 10))
plt.title('Feature Importance')
plt.barh(range(len(indices)), importances[indices], color='b')
plt.yticks(range(len(indices)), features[indices])
plt.show()

# Export the first tree of the forest (top 2 levels) to PNG via graphviz.
est = rf.estimators_[0]
export_graphviz(est, out_file='rftree.dot',
                feature_names=X_train.columns,
                rounded=True, proportion=False,
                precision=2, filled=True, max_depth=2)
call(['dot', '-Tpng', 'rftree.dot', '-o', 'rftree.png', '-Gdpi=600'])
Image(filename='./rftree.png')
# Prediction-interval approach adapted from:
# https://blog.datadive.net/prediction-intervals-for-random-forests/
def interval(model, X, percentile=95):
    """Per-sample prediction interval from the spread of ensemble members.

    Each ensemble member's prediction is treated as a draw from the
    predictive distribution; the interval is taken from its percentiles.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``, each with ``predict``.
    X : array-like of preprocessed samples, shape (n_samples, n_features).
    percentile : int, nominal width of the central interval (default 95).

    Returns
    -------
    (pi_down, pi_up) : two lists of length n_samples holding the lower and
    upper interval bounds.
    """
    # Stack member predictions into (n_estimators, n_samples) and take the
    # percentiles column-wise in one vectorized pass, replacing the original
    # O(n_estimators * n_samples) nested Python loops with identical output.
    all_preds = np.stack([est.predict(X) for est in model.estimators_])
    lo_q = (100 - percentile) / 2.
    pi_down = np.percentile(all_preds, q=lo_q, axis=0).tolist()
    pi_up = np.percentile(all_preds, q=100 - lo_q, axis=0).tolist()
    return pi_down, pi_up
# Pull the preprocessing (imputation only) into its own interval pipeline so
# the raw hold-out rows can be transformed before hitting the bare forest.
# NOTE(review): this pipeline is never fit here; it works only because
# `preprocessor` was already fitted in place by the earlier Pipeline.fit
# calls -- confirm before reusing this pattern elsewhere.
intervalPipeline = Pipeline(steps=[
    ('preprocess', preprocessor)
])
# Generate the Intervals
intervalY = y_test.copy().reset_index(drop=True)
intervalX = X_test.copy().reset_index(drop=True)
intervalX = intervalPipeline.transform(intervalX)
# Point predictions from the full forest on the transformed features.
intervalPred = rf.predict(intervalX)
percentile = 95
err_down, err_up = interval(rf, intervalX, percentile=percentile)
# One row per test sample: point prediction, actual value, interval bounds.
piDF = pd.DataFrame({'Predicted': intervalPred,
                     'Lower': err_down,
                     'Actual': intervalY,
                     'Upper': err_up})
# Flag samples whose actual value falls inside the interval, then report the
# empirical coverage (ideally close to the nominal 95%).
piDF['Correct'] = np.where((piDF.Lower <= piDF.Actual) & (piDF.Actual <= piDF.Upper), 1, 0)
print('Percent in {}th prediction interval: {:.3f}'.format(percentile,
                                                           piDF.Correct.sum() / piDF.Correct.shape[0] * 100.0))
# Sort by actual value so the interval band reads as a smooth envelope.
df = piDF.sort_values('Actual').reset_index()
plt.figure(figsize=(15, 10))
plt.title('Prediction Interval {}th Percentile'.format(percentile))
plt.ylabel('Median Value')
# Draw order matters for color cycle and z-order: actual line first, then the
# grey bounds, scatter of point predictions, and the shaded band underneath.
plt.plot(df.Actual, label='Actual')
plt.plot(df.Lower, color='grey')
plt.plot(df.Upper, color='grey')
plt.scatter(df.index, df.Predicted)
plt.fill_between(df.index, df.Lower, df.Upper, color='grey', alpha=0.5)
plt.show()
# Decompose the forest's predictions into per-feature contributions with
# treeinterpreter: prediction = bias (training-set mean) + sum(contributions).
prediction, bias, contributions = ti.predict(rf, X_test)
contribDF = pd.DataFrame({'Feature': X_test.columns,
                          'Contribution': contributions[0],
                          'Value': X_test.iloc[0]})
contribDF = contribDF.sort_values('Contribution').set_index('Feature')
print('Baseline (Y-Mean) / Bias of RF : {:.2f}\n'.format(bias[0]))
print(contribDF)
print('\nSum of Contribution {:.2f}\n'.format(contribDF.Contribution.sum()))
print('Predicted Value of RF : {:.2f}'.format(*prediction[0]))
print('Predicted Value of RF : {:.2f} = Bias {:.2f} + Contribution {:.2f}'.format(
    *prediction[0], bias[0], contribDF.Contribution.sum()))
# Horizontal bars: most positive contributions at the top after the sort.
contribDF.plot.barh(y='Contribution', figsize=(12, 10), color='C0')
plt.title('Feature Contribution for Prediction 0')
# Hyper-parameter search space for Extra Trees (same grid as the forest).
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Create the random grid for feature selection
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
pprint(random_grid)
etSearch = ExtraTreesRegressor()
etRandomSearch = RandomizedSearchCV(estimator=etSearch,
                                    param_distributions=random_grid,
                                    n_iter=300,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
etSearchPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', etRandomSearch)
])
etSearchPipe.fit(X_train, y_train)
etRandomSearch.best_params_
# BUG FIX: the final model previously mixed in parameters from the random
# forest search (rfRandomSearch.best_params_); use the Extra Trees search
# results for every parameter.
et = ExtraTreesRegressor(n_estimators=etRandomSearch.best_params_['n_estimators'],
                         min_samples_split=etRandomSearch.best_params_['min_samples_split'],
                         min_samples_leaf=etRandomSearch.best_params_['min_samples_leaf'],
                         max_features=etRandomSearch.best_params_['max_features'])
etPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', et)
])
etPipe.fit(X_train, y_train)
preds = etPipe.predict(X_test)
print('Score Train {:.3f}'.format(etPipe.score(X_train, y_train)))
print('Score Test {:.3f}'.format(etPipe.score(X_test, y_test)))
maeET, mseET, rmseET, accuracyET = metrics(y_test, preds)
resultsDF.loc['Extra Trees'] = [maeET, mseET, rmseET, accuracyET]
# Hyper-parameter search space for the Regularized Greedy Forest.
algorithm = ['RGF', 'RGF_Opt', 'RGF_Sib']
max_leaf = [250, 500, 750, 1000, 1500, 2000]
l2 = [0.01, 0.1, 1.0]
learning_rate = [0.1, 0.25, 0.5, 0.75, 1.0]
# Create the random grid for feature selection
random_grid = {'algorithm': algorithm,
               'max_leaf': max_leaf,
               'l2': l2,
               'learning_rate': learning_rate}
pprint(random_grid)
rgfSearch = RGFRegressor(test_interval=100, loss="LS", verbose=False)
rgfRandomSearch = RandomizedSearchCV(estimator=rgfSearch,
                                     param_distributions=random_grid,
                                     n_iter=300,
                                     verbose=2,
                                     random_state=42,
                                     n_jobs=-1)
rgfPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', rgfRandomSearch)
])
rgfPipe.fit(X_train, y_train)
rgfRandomSearch.best_params_
# Refit a single RGF model with the best parameters from the search.
rgf = RGFRegressor(test_interval=100,
                   loss="LS",
                   verbose=False,
                   max_leaf=rgfRandomSearch.best_params_['max_leaf'],
                   learning_rate=rgfRandomSearch.best_params_['learning_rate'],
                   algorithm=rgfRandomSearch.best_params_['algorithm'],
                   l2=rgfRandomSearch.best_params_['l2'])
rgfPipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', rgf)
])
# BUG FIX: the pipeline was fitted twice in a row (a bare rgfPipe.fit(...)
# followed by rgfPipe.fit(...).predict(...)), doubling training time for no
# benefit; fit once and predict.
y_pred = rgfPipe.fit(X_train, y_train).predict(X_test)
maeRGF, mseRGF, rmseRGF, accuracyRGF = metrics(y_test, y_pred)
resultsDF.loc['Regularized Greedy Forest'] = [maeRGF, mseRGF, rmseRGF, accuracyRGF]
# Residuals plot for the RGF model.
# BUG FIX: this section previously built ResidualsPlot(rfPipe), re-rendering
# the random forest residuals instead of the RGF pipeline fitted just above.
plt.figure(figsize=(15, 10))
visualizer = ResidualsPlot(rgfPipe)
visualizer.fit(X_train, y_train)    # Fit the training data to the model
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.poof()
# RGF variable importances, plotted least to most important.
impDF = pd.DataFrame({'Variables': X_train.columns, 'Importance': rgf.feature_importances_})
impDF.set_index('Variables').sort_values('Importance', ascending=True).plot.barh(figsize=(10, 8))
plt.title('Variable Importance (Normalized)')
plt.legend(loc='lower right')
# Final comparison of all models, best first by each metric.
resultsDF.sort_values('MAE')
resultsDF.sort_values('RMSE')